import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
# to visulize the weekdays
import plotly.express as px
#shift +tab is used to get the function details and auto sugesstion
import seaborn as sns
files = os.listdir(r"G:\Tutorials Notes\UdemyCourses\DataAnalysis_Projects\Uber_NewYork_DA\DataFiles\drive-download-20210609T171232Z-001")
#r is to show the exact address of the data location
# for the last 7 files only
files = files[-7:]
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# have to remove a file,list
files.remove('uber-raw-data-janjune-15.csv')
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# now to concatinate the data of multiple files
path = r'G:\Tutorials Notes\UdemyCourses\DataAnalysis_Projects\Uber_NewYork_DA\DataFiles\drive-download-20210609T171232Z-001'
final = pd.DataFrame()
for file in files:
df = pd.read_csv(path+ "/" + file, encoding = 'utf-8')
final = pd.concat([final, df])
final.shape
(4534327, 4)
df = final
df.head()
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 4/1/2014 0:11:00 | 40.7690 | -73.9549 | B02512 |
| 1 | 4/1/2014 0:17:00 | 40.7267 | -74.0345 | B02512 |
| 2 | 4/1/2014 0:21:00 | 40.7316 | -73.9873 | B02512 |
| 3 | 4/1/2014 0:28:00 | 40.7588 | -73.9776 | B02512 |
| 4 | 4/1/2014 0:33:00 | 40.7594 | -73.9722 | B02512 |
df.dtypes
Date/Time object Lat float64 Lon float64 Base object dtype: object
df['Date/Time'].head()
0 4/1/2014 0:11:00 1 4/1/2014 0:17:00 2 4/1/2014 0:21:00 3 4/1/2014 0:28:00 4 4/1/2014 0:33:00 Name: Date/Time, dtype: object
#have to convert the first coloum into date time object
df['Date/Time'] = pd.to_datetime(df['Date/Time'],format = '%m/%d/%Y %H:%M:%S')
df.dtypes
Date/Time datetime64[ns] Lat float64 Lon float64 Base object dtype: object
df.head()
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 |
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 |
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 |
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 |
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 |
df['day'] = df['Date/Time'].dt.day
df['minute'] = df['Date/Time'].dt.minute
df['month'] = df['Date/Time'].dt.month
df['year'] = df['Date/Time'].dt.year
| Date/Time | Lat | Lon | Base | day | minute | month | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 11 | 4 | 2014 |
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 17 | 4 | 2014 |
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 21 | 4 | 2014 |
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 28 | 4 | 2014 |
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 33 | 4 | 2014 |
df['hour'] = df['Date/Time'].dt.hour
df['weekday'] = df['Date/Time'].dt.day_name()
| Date/Time | Lat | Lon | Base | day | minute | month | year | weekday | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 11 | 4 | 2014 | Tuesday |
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 17 | 4 | 2014 | Tuesday |
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 21 | 4 | 2014 | Tuesday |
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 28 | 4 | 2014 | Tuesday |
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 33 | 4 | 2014 | Tuesday |
df.head()
| Date/Time | Lat | Lon | Base | day | minute | month | year | weekday | hour | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 11 | 4 | 2014 | Tuesday | 0 |
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 17 | 4 | 2014 | Tuesday | 0 |
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 21 | 4 | 2014 | Tuesday | 0 |
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 28 | 4 | 2014 | Tuesday | 0 |
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 33 | 4 | 2014 | Tuesday | 0 |
# to see the counts
df['weekday'].value_counts()
Thursday 755145 Friday 741139 Wednesday 696488 Tuesday 663789 Saturday 646114 Monday 541472 Sunday 490180 Name: weekday, dtype: int64
#bar bar plot
px.bar(x=df['weekday'].value_counts().index ,y = df['weekday'].value_counts())
# so rush is higher on thursday and friday
# to plot the histogram
plt.hist(df['hour'])
# it means during the eveing time , they get most rides
(array([216928., 103517., 227152., 543565., 324851., 366329., 819491.,
660869., 579117., 692508.]),
array([ 0. , 2.3, 4.6, 6.9, 9.2, 11.5, 13.8, 16.1, 18.4, 20.7, 23. ]),
<BarContainer object of 10 artists>)
# tpo view the rides for each unique months
df['month'].unique()
array([4, 8, 7, 6, 5, 9], dtype=int64)
# to iterate and vizulise for each month, we use the subplot
for i,month in enumerate(df['month'].unique()):
print(i)
print(month)
0 4 1 8 2 7 3 6 4 5 5 9
# to plot for each month
plt.figure(figsize = (40,20))
for i,month in enumerate(df['month'].unique()):
plt.subplot(3,2,i+1)
df[df['month']==month]['hour'].hist()
#by this we can see that for each month uber get the max rides on evening hours
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot,iplot
# to visulize in a much better way
df.groupby('month')['hour'].count()
month 4 564516 5 652435 6 663844 7 796121 8 829275 9 1028136 Name: hour, dtype: int64
#Q = which month has maximum rides
trace1 = go.Bar(x = df.groupby('month')['hour'].count().index, y = df.groupby('month')['hour'].count(), name = 'Priority')
iplot([trace1])
# So September has the maximum rrides
sns.displot(df['day'])
<seaborn.axisgrid.FacetGrid at 0x220e76f3df0>
# Analysis of Journer on each day
plt.figure(figsize=(10,8))
plt.hist(df['day'], bins = 30, rwidth = 0.8, range= (0.5, 30.5)) # adding range makes the difference
plt.xlabel('date of the months')
plt.ylabel('Total journey')
plt.title('Journey BY Month Day')
Text(0.5, 1.0, 'Journey BY Month Day')
#@ Analysis of total rides month wise
plt.figure(figsize = (20,8))
for i,month in enumerate(df['month'].unique(),1):
plt.subplot(3,2,i)
df_out = df[df['month']==month]
plt.hist(df_out['day'])
plt.xlabel('days in month {}'.format(month))
plt.ylabel('total_rides')
#Q- Analysing rush in hours
# simplar to the line plot
ax = sns.pointplot(x = 'hour', y='Lat',data = df, hue = 'weekday')
ax.set_title('HoursOfDay vs Latitue of Passenger')
Text(0.5, 1.0, 'HoursOfDay vs Latitue of Passenger')
#Q = analysis which base number gets popular by month name
df.head()
| Date/Time | Lat | Lon | Base | day | minute | month | year | weekday | hour | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 11 | 4 | 2014 | Tuesday | 0 |
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 17 | 4 | 2014 | Tuesday | 0 |
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 21 | 4 | 2014 | Tuesday | 0 |
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 28 | 4 | 2014 | Tuesday | 0 |
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 33 | 4 | 2014 | Tuesday | 0 |
base = df.groupby(['Base','month'])['Date/Time'].count().reset_index()
base
| Base | month | Date/Time | |
|---|---|---|---|
| 0 | B02512 | 4 | 35536 |
| 1 | B02512 | 5 | 36765 |
| 2 | B02512 | 6 | 32509 |
| 3 | B02512 | 7 | 35021 |
| 4 | B02512 | 8 | 31472 |
| 5 | B02512 | 9 | 34370 |
| 6 | B02598 | 4 | 183263 |
| 7 | B02598 | 5 | 260549 |
| 8 | B02598 | 6 | 242975 |
| 9 | B02598 | 7 | 245597 |
| 10 | B02598 | 8 | 220129 |
| 11 | B02598 | 9 | 240600 |
| 12 | B02617 | 4 | 108001 |
| 13 | B02617 | 5 | 122734 |
| 14 | B02617 | 6 | 184460 |
| 15 | B02617 | 7 | 310160 |
| 16 | B02617 | 8 | 355803 |
| 17 | B02617 | 9 | 377695 |
| 18 | B02682 | 4 | 227808 |
| 19 | B02682 | 5 | 222883 |
| 20 | B02682 | 6 | 194926 |
| 21 | B02682 | 7 | 196754 |
| 22 | B02682 | 8 | 173280 |
| 23 | B02682 | 9 | 197138 |
| 24 | B02764 | 4 | 9908 |
| 25 | B02764 | 5 | 9504 |
| 26 | B02764 | 6 | 8974 |
| 27 | B02764 | 7 | 8589 |
| 28 | B02764 | 8 | 48591 |
| 29 | B02764 | 9 | 178333 |
plt.figure(figsize=(20,6))
sns.lineplot(x='month',y='Date/Time',hue='Base', data = base)
<AxesSubplot:xlabel='month', ylabel='Date/Time'>
#Q performe cross analysis
#1= HeatMap by hour and weekday
#2 = HeatMap by hour and day
#3 = HeatMap by Month and day
#4 = HeatMap by Month and weekday
def count_rows(rows):
return len(rows)
by_cross = df.groupby(['weekday','hour']).apply(count_rows)
by_cross
weekday hour
Friday 0 13716
1 8163
2 5350
3 6930
4 8806
...
Wednesday 19 47017
20 47772
21 44553
22 32868
23 18146
Length: 168, dtype: int64
pivot = by_cross.unstack()
pivot
| hour | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| weekday | |||||||||||||||||||||
| Friday | 13716 | 8163 | 5350 | 6930 | 8806 | 13450 | 23412 | 32061 | 31509 | 25230 | ... | 36206 | 43673 | 48169 | 51961 | 54762 | 49595 | 43542 | 48323 | 49409 | 41260 |
| Monday | 6436 | 3737 | 2938 | 6232 | 9640 | 15032 | 23746 | 31159 | 29265 | 22197 | ... | 28157 | 32744 | 38770 | 42023 | 37000 | 34159 | 32849 | 28925 | 20158 | 11811 |
| Saturday | 27633 | 19189 | 12710 | 9542 | 6846 | 7084 | 8579 | 11014 | 14411 | 17669 | ... | 31418 | 38769 | 43512 | 42844 | 45883 | 41098 | 38714 | 43826 | 47951 | 43174 |
| Sunday | 32877 | 23015 | 15436 | 10597 | 6374 | 6169 | 6596 | 8728 | 12128 | 16401 | ... | 28151 | 31112 | 33038 | 31521 | 28291 | 25948 | 25076 | 23967 | 19566 | 12166 |
| Thursday | 9293 | 5290 | 3719 | 5637 | 8505 | 14169 | 27065 | 37038 | 35431 | 27812 | ... | 36699 | 44442 | 50560 | 56704 | 55825 | 51907 | 51990 | 51953 | 44194 | 27764 |
| Tuesday | 6237 | 3509 | 2571 | 4494 | 7548 | 14241 | 26872 | 36599 | 33934 | 25023 | ... | 34846 | 41338 | 48667 | 55500 | 50186 | 44789 | 44661 | 39913 | 27712 | 14869 |
| Wednesday | 7644 | 4324 | 3141 | 4855 | 7511 | 13794 | 26943 | 36495 | 33826 | 25635 | ... | 35148 | 43388 | 50684 | 55637 | 52732 | 47017 | 47772 | 44553 | 32868 | 18146 |
7 rows × 24 columns
plt.figure(figsize=(15,6))
sns.heatmap(pivot)
<AxesSubplot:xlabel='hour', ylabel='weekday'>
def heatmap(col1,col2):
by_cross = df.groupby([col1,col2]).apply(count_rows)
pivot = by_cross.unstack()
plt.figure(figsize=(15,6))
return sns.heatmap(pivot)
heatmap('day','hour')
<AxesSubplot:xlabel='hour', ylabel='day'>
plt.figure(figsize = (12,6))
plt.plot(df['Lon'],df['Lat'],'r+',ms = 0.5)
plt.xlim(-74.2,-73.7)
plt.ylim(40.6,41)
(40.6, 41.0)
df_out = df[df['weekday'] =='Sunday']
df
| Date/Time | Lat | Lon | Base | day | minute | month | year | weekday | hour | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 11 | 4 | 2014 | Tuesday | 0 |
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 17 | 4 | 2014 | Tuesday | 0 |
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 21 | 4 | 2014 | Tuesday | 0 |
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 28 | 4 | 2014 | Tuesday | 0 |
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 33 | 4 | 2014 | Tuesday | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1028131 | 2014-09-30 22:57:00 | 40.7668 | -73.9845 | B02764 | 30 | 57 | 9 | 2014 | Tuesday | 22 |
| 1028132 | 2014-09-30 22:57:00 | 40.6911 | -74.1773 | B02764 | 30 | 57 | 9 | 2014 | Tuesday | 22 |
| 1028133 | 2014-09-30 22:58:00 | 40.8519 | -73.9319 | B02764 | 30 | 58 | 9 | 2014 | Tuesday | 22 |
| 1028134 | 2014-09-30 22:58:00 | 40.7081 | -74.0066 | B02764 | 30 | 58 | 9 | 2014 | Tuesday | 22 |
| 1028135 | 2014-09-30 22:58:00 | 40.7140 | -73.9496 | B02764 | 30 | 58 | 9 | 2014 | Tuesday | 22 |
4534327 rows × 10 columns
df_out.shape
(490180, 10)
df_out.head()
| Date/Time | Lat | Lon | Base | day | minute | month | year | weekday | hour | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6965 | 2014-04-06 00:00:00 | 40.6547 | -74.3033 | B02512 | 6 | 0 | 4 | 2014 | Sunday | 0 |
| 6966 | 2014-04-06 00:00:00 | 40.7356 | -74.0006 | B02512 | 6 | 0 | 4 | 2014 | Sunday | 0 |
| 6967 | 2014-04-06 00:00:00 | 40.7421 | -74.0041 | B02512 | 6 | 0 | 4 | 2014 | Sunday | 0 |
| 6968 | 2014-04-06 00:00:00 | 40.7401 | -74.0053 | B02512 | 6 | 0 | 4 | 2014 | Sunday | 0 |
| 6969 | 2014-04-06 00:01:00 | 40.7368 | -73.9877 | B02512 | 6 | 1 | 4 | 2014 | Sunday | 0 |
rush = df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()
rush
| Lat | Lon | weekday | |
|---|---|---|---|
| 0 | 39.9374 | -74.0722 | 1 |
| 1 | 39.9378 | -74.0721 | 1 |
| 2 | 39.9384 | -74.0742 | 1 |
| 3 | 39.9385 | -74.0734 | 1 |
| 4 | 39.9415 | -74.0736 | 1 |
| ... | ... | ... | ... |
| 209225 | 41.3141 | -74.1249 | 1 |
| 209226 | 41.3180 | -74.1298 | 1 |
| 209227 | 41.3195 | -73.6905 | 1 |
| 209228 | 41.3197 | -73.6903 | 1 |
| 209229 | 42.1166 | -72.0666 | 1 |
209230 rows × 3 columns
rush.coloumns = ['Lat','Lon','no of trips']
rush
<ipython-input-105-ba1cc865827f>:1: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
| Lat | Lon | weekday | |
|---|---|---|---|
| 0 | 39.9374 | -74.0722 | 1 |
| 1 | 39.9378 | -74.0721 | 1 |
| 2 | 39.9384 | -74.0742 | 1 |
| 3 | 39.9385 | -74.0734 | 1 |
| 4 | 39.9415 | -74.0736 | 1 |
| ... | ... | ... | ... |
| 209225 | 41.3141 | -74.1249 | 1 |
| 209226 | 41.3180 | -74.1298 | 1 |
| 209227 | 41.3195 | -73.6905 | 1 |
| 209228 | 41.3197 | -73.6903 | 1 |
| 209229 | 42.1166 | -72.0666 | 1 |
209230 rows × 3 columns
ERROR: Could not find a version that satisfies the requirement foilum (from versions: none) ERROR: No matching distribution found for foilum
import folium
from folium.plugins import HeatMap
baseMap = folium.Map()
baseMap
HeatMap(rush, zoom = 20, radius = 15).add_to(baseMap)
baseMap